def Preprocess():
    """Convert ``document.tsv`` into the cleaned corpus file ``corpus.tsv``.

    The first line of ``document.tsv`` is skipped as a header. Every other
    line is split on tabs and must provide at least four columns:

    * columns 0 and 1 are copied through unchanged (they are written under
      the ``LINKID`` and ``ROOT`` headers);
    * column 2 is tokenized with gensim's ``simple_preprocess``; stopwords
      and tokens of three characters or fewer are dropped;
    * column 3 is split on single spaces and empty pieces are dropped.

    Each output row is ``LINKID<TAB>ROOT<TAB>WORDS<TAB>PHRASES`` where WORDS
    and PHRASES are space-separated (and possibly empty).

    Raises:
        IndexError: if an input line has fewer than four tab-separated columns.
        OSError: if either file cannot be opened.
    """
    from gensim.utils import simple_preprocess
    from gensim.parsing.preprocessing import STOPWORDS
    # The input is opened first so that a missing document.tsv does not leave
    # a freshly truncated corpus.tsv behind; `with` closes both handles even
    # when a malformed row raises in the middle of the loop.
    with open('document.tsv','r') as fr, open('corpus.tsv','w') as fw:
        fr.readline()  # skip the input header
        fw.write('LINKID\tROOT\tWORDS\tPHRASES\n')
        for line in fr:
            arr = line.strip('\r\n').split('\t')
            words = [token for token in simple_preprocess(arr[2])
                     if token not in STOPWORDS and len(token) > 3]
            # Consecutive spaces produce empty pieces; drop them.
            phrases = [token for token in arr[3].split(' ') if token != '']
            fw.write('\t'.join([arr[0],arr[1],' '.join(words),' '.join(phrases)])+'\n')

def LoadCorpus(ifPost,ifResponse,ifWord,ifPhrase):
    """Load documents from ``corpus.tsv`` as lists of tokens.

    The first line of the file is skipped as a header. Each remaining line is
    split on tabs; column 1 selects the row type (``'Y'`` rows are kept when
    ``ifPost`` is true, ``'N'`` rows when ``ifResponse`` is true), column 2
    holds space-separated words and column 3 space-separated phrases.

    Args:
        ifPost: keep rows whose second column is ``'Y'``.
        ifResponse: keep rows whose second column is ``'N'``.
        ifWord: include the tokens of column 2 in each document.
        ifPhrase: include the tokens of column 3 (after the words).

    Returns:
        A list with one token list per selected row, in file order. A
        selected row with no tokens yields an empty list.

    Raises:
        IndexError: if a row lacks a column that has to be read.
        OSError: if ``corpus.tsv`` cannot be opened.
    """
    ret = []
    # `with` guarantees the handle is released even if a malformed row raises.
    with open('corpus.tsv','r') as fr:
        fr.readline()  # skip header
        for line in fr:
            arr = line.strip('\r\n').split('\t')
            wanted = (ifPost and arr[1] == 'Y') or (ifResponse and arr[1] == 'N')
            if not wanted:
                continue
            doc = []
            # An empty column must contribute nothing: ''.split(' ') would
            # otherwise yield [''].
            if ifWord and arr[2] != '':
                doc.extend(arr[2].split(' '))
            if ifPhrase and arr[3] != '':
                doc.extend(arr[3].split(' '))
            ret.append(doc)
    return ret

def CompressCorpus(corpus,vocabSize):
    """Restrict a corpus to its most frequent words and phrases.

    Tokens containing ``'_'`` are counted as phrases, all others as words.
    The ``vocabSize`` most frequent words and the ``vocabSize`` most frequent
    phrases form the vocabulary (ties keep first-seen order). Every document
    is filtered down to vocabulary tokens, and documents that end up empty
    are dropped.

    Args:
        corpus: list of documents, each a list of token strings.
        vocabSize: maximum number of words and, separately, of phrases kept.

    Returns:
        A new list of filtered, non-empty documents in their original order.
    """
    # Separate frequency tables for plain words and '_'-joined phrases.
    wordFreq,phraseFreq = {},{}
    for doc in corpus:
        for token in doc:
            table = phraseFreq if '_' in token else wordFreq
            table[token] = table.get(token,0)+1

    def _mostFrequent(table):
        # Stable descending sort, so equally frequent tokens stay in
        # first-seen order before the cut-off is applied.
        ranked = sorted(table.items(),key=lambda item:item[1],reverse=True)
        return ranked[:min(len(ranked),vocabSize)]

    vocab = {token for token,_ in _mostFrequent(wordFreq)}
    vocab |= {token for token,_ in _mostFrequent(phraseFreq)}
    filtered = ([token for token in doc if token in vocab] for doc in corpus)
    return [doc for doc in filtered if len(doc) > 0]

def LoadPairCorpus(ifWord,ifPhrase):
    """Load post/response groups from ``corpus.tsv``.

    The first line of the file is skipped as a header. Rows are grouped by
    their first column (the id). Within a group, rows whose second column is
    ``'Y'`` are collected as posts and rows with ``'N'`` as responses; rows
    with any other value are ignored. Each row becomes a token list built
    from column 2 (words, if ``ifWord``) followed by column 3 (phrases, if
    ``ifPhrase``), both space-separated.

    Args:
        ifWord: include the tokens of column 2 in each document.
        ifPhrase: include the tokens of column 3 in each document.

    Returns:
        A list of ``[posts, responses]`` pairs, one per id that has exactly
        one post and at least one response, in order of first appearance of
        the id. ``posts`` and ``responses`` are lists of token lists (which
        may be empty).

    Raises:
        IndexError: if a row lacks a column that has to be read.
        OSError: if ``corpus.tsv`` cannot be opened.
    """
    id2postresponses = {}
    # `with` guarantees the handle is released even if a malformed row raises.
    with open('corpus.tsv','r') as fr:
        fr.readline()  # skip header
        for line in fr:
            arr = line.strip('\r\n').split('\t')
            # The group is registered even when the row turns out to be
            # neither a post nor a response.
            posts,responses = id2postresponses.setdefault(arr[0],[[],[]])
            doc = []
            # An empty column must contribute nothing: ''.split(' ') would
            # otherwise yield [''].
            if ifWord and arr[2] != '':
                doc.extend(arr[2].split(' '))
            if ifPhrase and arr[3] != '':
                doc.extend(arr[3].split(' '))
            if arr[1] == 'Y':
                posts.append(doc)
            elif arr[1] == 'N':
                responses.append(doc)
    # Keep only groups with a single post and at least one response.
    return [[posts,responses]
            for posts,responses in id2postresponses.values()
            if len(posts) == 1 and len(responses) > 0]

def CompressPairCorpus(corpus,vocabSize):
    """Restrict a post/response corpus to its most frequent words and phrases.

    Token frequencies are counted over all posts and responses together.
    Tokens containing ``'_'`` are counted as phrases, all others as words;
    the ``vocabSize`` most frequent of each kind form the vocabulary (ties
    keep first-seen order). Every document is filtered down to vocabulary
    tokens and emptied documents are dropped. A pair is kept only if it
    still has exactly one post and at least one response afterwards.

    Args:
        corpus: list of ``[posts, responses]`` pairs, where both members are
            lists of token lists.
        vocabSize: maximum number of words and, separately, of phrases kept.

    Returns:
        A new list of filtered ``[posts, responses]`` pairs in original order.
    """
    # Separate frequency tables for plain words and '_'-joined phrases.
    wordFreq,phraseFreq = {},{}
    for posts,responses in corpus:
        for doc in posts+responses:
            for token in doc:
                table = phraseFreq if '_' in token else wordFreq
                table[token] = table.get(token,0)+1

    def _mostFrequent(table):
        # Stable descending sort, so equally frequent tokens stay in
        # first-seen order before the cut-off is applied.
        ranked = sorted(table.items(),key=lambda item:item[1],reverse=True)
        return ranked[:min(len(ranked),vocabSize)]

    vocab = {token for token,_ in _mostFrequent(wordFreq)}
    vocab |= {token for token,_ in _mostFrequent(phraseFreq)}

    def _keepKnown(docs):
        # Filter each document to vocabulary tokens; discard emptied documents.
        filtered = ([token for token in doc if token in vocab] for doc in docs)
        return [doc for doc in filtered if len(doc) > 0]

    ret = []
    for posts,responses in corpus:
        keptPosts = _keepKnown(posts)
        keptResponses = _keepKnown(responses)
        if len(keptPosts) == 1 and len(keptResponses) > 0:
            ret.append([keptPosts,keptResponses])
    return ret

def LDA(filename,corpusRaw,nTopics):
    """Fit an LDA-style topic model with BayesPy and write the result to disk.

    Every token occurrence in ``corpusRaw`` is flattened into two parallel
    arrays (document index, vocabulary index). The model has one Dirichlet
    topic distribution per document (``p_topic``) and one Dirichlet word
    distribution per topic (``p_word``). It is trained for a fixed number of
    iterations on random subsets of token occurrences, updating ``topics``
    and then taking a gradient step on ``p_topic`` and ``p_word`` with a
    decaying step size.

    Two files are written:

    * ``<filename>_topic_tokens.txt``: ``topic<TAB>token<TAB>score`` lines.
      For each topic, the 300 best-scoring words and the 300 best-scoring
      phrases (tokens containing ``'_'``) are merged and listed by
      descending score.
    * ``<filename>_doc_topic.txt``: one line per document containing
      space-separated ``topic:score`` entries for every score above zero.

    Args:
        filename: prefix of the two output file names.
        corpusRaw: list of documents, each a list of token strings.
        nTopics: number of topics.
    """
    import numpy as np
    from bayespy import nodes
    from bayespy.inference.vmp.nodes.categorical import CategoricalMoments
    from bayespy.inference import VB
    ### Constant ###
    nIters,nTop = 1000,300
    subsetSize,delay,forgettingRate = 1000,1,0.7
    ### Data ###
    nDocuments = len(corpusRaw)
    # id2token maps vocabulary index -> token, token2id is the inverse.
    documents,corpus,id2token,token2id = [],[],[],{}
    for idDoc in range(nDocuments):
        for token in corpusRaw[idDoc]:
            if token not in token2id:
                token2id[token] = len(id2token)
                id2token.append(token)
            documents.append(idDoc)
            corpus.append(token2id[token])
    nWords = len(documents)
    nVocabulary = len(id2token)
    print('#docs',nDocuments)    
    print('#words',nWords)
    print('#vocab',nVocabulary)
    documents = np.array(documents)
    corpus = np.array(corpus)
    ### Variational Inference ###
    # NOTE(review): with fewer than subsetSize token occurrences this
    # multiplier is 0 and documents[:subsetSize] is shorter than the declared
    # plates; presumably the model then fails or degenerates -- confirm
    # before running on small corpora.
    platesMultiplier = int(nWords/subsetSize)
    _p_topic_ = nodes.Dirichlet(np.ones(nTopics),plates=(nDocuments,),name='p_topic')
    _p_word_ = nodes.Dirichlet(np.ones(nVocabulary),plates=(nTopics,),name='p_word')
    _document_indices_ = nodes.Constant(CategoricalMoments(nDocuments),documents[:subsetSize],name='document_indices')
    _topics_ = nodes.Categorical(nodes.Gate(_document_indices_,_p_topic_),plates=(subsetSize,),
            plates_multiplier=(platesMultiplier,),name='topics')
    _words_ = nodes.Categorical(nodes.Gate(_topics_,_p_word_),name='words')
    _p_topic_.initialize_from_random()
    _p_word_.initialize_from_random()
    Q = VB(_words_,_topics_,_p_word_,_p_topic_,_document_indices_)
    Q.ignore_bound_checks = True
    for n in range(nIters):
        # Draw a random mini-batch of token occurrences (with replacement).
        subset = np.random.choice(nWords,subsetSize)
        Q['words'].observe(corpus[subset])
        Q['document_indices'].set_value(documents[subset])
        Q.update('topics')
        # Step size decays as (n+delay)^(-forgettingRate).
        step = (n+delay)**(-forgettingRate)
        Q.gradient_step('p_topic','p_word',scale=step)
    ### Output ###
    # NOTE(review): scores are read from get_parameters()[0]; presumably these
    # are the Dirichlet parameters rather than normalized probabilities --
    # confirm against the BayesPy documentation.
    p_word_parameters = Q['p_word'].get_parameters()
    # `with` closes the output files even if a write or score lookup raises.
    with open(filename+'_topic_tokens.txt','w') as fw:
        for idTopic in range(nTopics):
            word_score,phrase_score = [],[]
            for idToken in range(nVocabulary):
                token = id2token[idToken]
                score = p_word_parameters[0][idTopic][idToken]
                if '_' in token:
                    phrase_score.append((token,score))
                else:
                    word_score.append((token,score))
            word_score.sort(key=lambda x:-x[1])
            phrase_score.sort(key=lambda x:-x[1])
            # Words and phrases are capped separately, then ranked together.
            token_score = sorted(word_score[:nTop]+phrase_score[:nTop],key=lambda x:-x[1])
            for token,score in token_score:
                fw.write(str(idTopic)+'\t'+token+'\t'+str(score)+'\n')
    p_topic_parameters = Q['p_topic'].get_parameters()
    with open(filename+'_doc_topic.txt','w') as fw:
        for idDocument in range(nDocuments):
            entries = []
            for idTopic in range(nTopics):
                score = p_topic_parameters[0][idDocument][idTopic]
                if score > 0:
                    entries.append(str(idTopic)+':'+str(score))
            fw.write(' '.join(entries)+'\n')

def PairLDA(filename,corpusPR,nTopicsP,nTopicsR):
    """Fit an LDA-style topic model on the responses of a paired corpus.

    Each ``[posts, responses]`` pair of ``corpusPR`` is treated as one
    document whose tokens are the tokens of all its responses. The tokens are
    flattened into two parallel arrays (pair index, vocabulary index). The
    model has one Dirichlet topic distribution per pair (``p_topic``) and one
    Dirichlet word distribution per topic (``p_word``). It is trained for a
    fixed number of iterations on random subsets of token occurrences.

    Two files are written:

    * ``<filename>_response_topic_tokens.txt``: ``topic<TAB>token<TAB>score``
      lines. For each topic, the 300 best-scoring words and the 300
      best-scoring phrases (tokens containing ``'_'``) are merged and listed
      by descending score.
    * ``<filename>_post_doc_topic.txt``: one line per pair containing
      space-separated ``topic:score`` entries for every score above zero.

    NOTE(review): ``nTopicsP`` and the posts of each pair are never used;
    the model is fit on responses only, with ``nTopicsR`` topics. The
    ``_post_doc_topic`` file therefore holds topic scores derived from
    responses -- confirm whether a separate post model was intended.

    Args:
        filename: prefix of the two output file names.
        corpusPR: list of ``[posts, responses]`` pairs, where ``responses``
            is a list of token lists.
        nTopicsP: accepted for interface compatibility; currently unused.
        nTopicsR: number of topics of the response model.
    """
    import numpy as np
    from bayespy import nodes
    from bayespy.inference.vmp.nodes.categorical import CategoricalMoments
    from bayespy.inference import VB
    nTopics = nTopicsR
    ### Constant ###
    nIters,nTop = 1000,300
    subsetSize,delay,forgettingRate = 1000,1,0.7
    ### Data ###
    nDocuments = len(corpusPR)
    # id2token maps vocabulary index -> token, token2id is the inverse.
    documents,corpus,id2token,token2id = [],[],[],{}
    for idDoc in range(nDocuments):
        posts,responses = corpusPR[idDoc]
        for response in responses:
            for token in response:
                if token not in token2id:
                    token2id[token] = len(id2token)
                    id2token.append(token)
                documents.append(idDoc)
                corpus.append(token2id[token])
    nWords = len(documents)
    nVocabulary = len(id2token)
    print('#docs',nDocuments)
    print('#words',nWords)
    print('#vocab',nVocabulary)
    documents = np.array(documents)
    corpus = np.array(corpus)
    ### Variational Inference ###
    # NOTE(review): with fewer than subsetSize token occurrences this
    # multiplier is 0 and documents[:subsetSize] is shorter than the declared
    # plates; presumably the model then fails or degenerates -- confirm
    # before running on small corpora.
    platesMultiplier = int(nWords/subsetSize)
    _p_topic_ = nodes.Dirichlet(np.ones(nTopics),plates=(nDocuments,),name='p_topic')
    _p_word_ = nodes.Dirichlet(np.ones(nVocabulary),plates=(nTopics,),name='p_word')
    _document_indices_ = nodes.Constant(CategoricalMoments(nDocuments),documents[:subsetSize],name='document_indices')
    _topics_ = nodes.Categorical(nodes.Gate(_document_indices_,_p_topic_),plates=(subsetSize,),
            plates_multiplier=(platesMultiplier,),name='topics')
    _words_ = nodes.Categorical(nodes.Gate(_topics_,_p_word_),name='words')
    _p_topic_.initialize_from_random()
    _p_word_.initialize_from_random()
    Q = VB(_words_,_topics_,_p_word_,_p_topic_,_document_indices_)
    Q.ignore_bound_checks = True
    for n in range(nIters):
        # Draw a random mini-batch of token occurrences (with replacement).
        subset = np.random.choice(nWords,subsetSize)
        Q['words'].observe(corpus[subset])
        Q['document_indices'].set_value(documents[subset])
        Q.update('topics')
        # Step size decays as (n+delay)^(-forgettingRate).
        step = (n+delay)**(-forgettingRate)
        Q.gradient_step('p_topic','p_word',scale=step)
    ### Output ###
    # NOTE(review): scores are read from get_parameters()[0]; presumably these
    # are the Dirichlet parameters rather than normalized probabilities --
    # confirm against the BayesPy documentation.
    p_word_parameters = Q['p_word'].get_parameters()
    # `with` closes the output files even if a write or score lookup raises.
    with open(filename+'_response_topic_tokens.txt','w') as fw:
        for idTopic in range(nTopics):
            word_score,phrase_score = [],[]
            for idToken in range(nVocabulary):
                token = id2token[idToken]
                score = p_word_parameters[0][idTopic][idToken]
                if '_' in token:
                    phrase_score.append((token,score))
                else:
                    word_score.append((token,score))
            word_score.sort(key=lambda x:-x[1])
            phrase_score.sort(key=lambda x:-x[1])
            # Words and phrases are capped separately, then ranked together.
            token_score = sorted(word_score[:nTop]+phrase_score[:nTop],key=lambda x:-x[1])
            for token,score in token_score:
                fw.write(str(idTopic)+'\t'+token+'\t'+str(score)+'\n')
    p_topic_parameters = Q['p_topic'].get_parameters()
    with open(filename+'_post_doc_topic.txt','w') as fw:
        for idDocument in range(nDocuments):
            entries = []
            for idTopic in range(nTopics):
                score = p_topic_parameters[0][idDocument][idTopic]
                if score > 0:
                    entries.append(str(idTopic)+':'+str(score))
            fw.write(' '.join(entries)+'\n')


#Preprocess()

#corpusPostWord = LoadCorpus(True,False,True,False)
#print('corpusPostWord',len(corpusPostWord))
#corpusPostWord = CompressCorpus(corpusPostWord,3000)
#LDA('lda_post_5_word',corpusPostWord,5)

#corpusResponseWord = LoadCorpus(False,True,True,False)
#print('corpusResponseWord',len(corpusResponseWord))
#corpusResponseWord = CompressCorpus(corpusResponseWord,3000)
#LDA('lda_response_10_word',corpusResponseWord,10)

#corpusPostPhrase = LoadCorpus(True,False,True,True)
#print('corpusPostPhrase',len(corpusPostPhrase))
#corpusPostPhrase = CompressCorpus(corpusPostPhrase,3000)
#LDA('lda_post_5_phrase',corpusPostPhrase,5)

#corpusResponsePhrase = LoadCorpus(False,True,True,True)
#print('corpusResponsePhrase',len(corpusResponsePhrase))
#corpusResponsePhrase = CompressCorpus(corpusResponsePhrase,3000)
#LDA('lda_response_10_phrase',corpusResponsePhrase,10)

#pairCorpusWord = LoadPairCorpus(True,False)
#print(len(pairCorpusWord))

#pairCorpusPhrase = LoadPairCorpus(True,True)
#print(len(pairCorpusPhrase))
#pairCorpusPhrase = CompressPairCorpus(pairCorpusPhrase,3000)
#PairLDA('pairlda_5_10_phrase',pairCorpusPhrase,5,10)



